# Load the annual forest data set and keep only the general + forest variables
# (night-lights columns AvgRad/StableLights are dropped). CL is binarized
# (any occurrence -> 1) and turned into a factor for the logistic models;
# rows missing the response or forest_density are removed.
data <- readRDS("~/peregrine_amazon/data/annual/aad_2021_forests.rds") %>%
  select(Code:Population, min_Precip:max_Precip_Month,
         forest_density, SWChange_abs:lat, -AvgRad, -StableLights) %>%
  mutate(CL = as.factor(ifelse(CL > 0, 1, 0)),
         Country = as.factor(Country)) %>%
  filter(!is.na(CL),
         !is.na(forest_density))
# Candidate predictor names: every column of `data` except the identifiers
# (Code, Name, Country), the response (CL), and the spatial fold id.
predictor_names <- setdiff(names(data), c("Code", "Name", "Country", "CL", "fold"))
We try different approaches to troubleshoot problems that we may encounter. The basic structure of each approach is simple. Use a spatiotemporal cross-validation approach to evaluate a logistic regression model. For every spatiotemporal fold \(ij\), we use all data outside of fold \(ij\) as the training data and test on the data inside fold \(ij\). Then we find the optimal threshold using a ROC curve on some validation data and finally test it on the untouched testing data.
Below, we just create a model on all the data without any spatiotemporal cross validation to see the top explanatory variables and compare with each approach.
# Baseline: a single logistic model on all predictors, with a plain stratified
# train/test split and no spatiotemporal cross validation.
set.seed(123)
# All columns except identifiers, the response, and the fold id are predictors.
full.predictor_names <- setdiff(names(data), c("Name", "Country", "Code", "CL", "fold"))
# Build "CL ~ x1 + x2 + ..." from the predictor names.
full.ml_formula <- as.formula(
  paste("CL ~", paste(full.predictor_names, collapse = " + "))
)
# Stratify on CL so both partitions keep the class balance.
split <- initial_split(data, strata = CL)
full.datrain <- training(split)
full.datest <- testing(split)
# Fit the baseline logistic regression on the training partition.
full.mod <- glm(full.ml_formula, data = full.datrain, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Out-of-sample performance of the baseline model on the held-out test split.
full.oob <- predict(full.mod, newdata=full.datest, type='response', na.action = na.pass)
# ROC on the raw predicted probabilities; levels=c(0,1) pins controls/cases.
full.temp_auc_out <- pROC::roc(response = full.datest$CL, predictor= full.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
# "Best" (Youden) threshold from the test ROC; per the plan, fold thresholds
# are averaged later.
full.best_threshold_out <- pROC::coords(full.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Labels/predictions frame obtained by thresholding the probabilities.
full.metrica.format <- data.frame(
  cbind(ifelse(full.datest$CL==1, 1, 0),
        ifelse(full.oob>=full.best_threshold_out[1,1], 1, 0))) %>% # 0.821
  mutate_all(as.factor)
colnames(full.metrica.format) <- c("labels","predictions")
rownames(full.metrica.format) <- 1:dim(full.metrica.format)[1]
# NOTE: this AUC is computed on the *binarized* predictions, so it equals
# (sensitivity + specificity) / 2 rather than the probability-based AUC above.
full.auc_out <- pROC::roc(response = full.metrica.format$labels, predictor= full.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
full.metrica.format.confusionMatrix <- full.metrica.format$predictions %>%
  confusionMatrix(positive='1', reference=full.metrica.format$labels)
# Sensitivity = recall of the positive ("1") class.
full.sensitivity_out <- caret::recall(
  data = full.metrica.format.confusionMatrix$table,
  relevant = rownames(full.metrica.format.confusionMatrix$table)[2]
)
# FIX: this was previously computed with caret::precision(), which is positive
# predictive value, not specificity. Specificity is the true-negative rate of
# the "0" class, so use caret::specificity() with the negative level.
full.specificity_out <- caret::specificity(
  data = full.metrica.format.confusionMatrix$table,
  negative = rownames(full.metrica.format.confusionMatrix$table)[1]
)
# Top-20 variable-importance plot for the baseline model.
full.var_imp_vars <- vip::vip(full.mod, num_features=20)
full.var_imp_vars
It seems that the socioeconomic and elevation variables sit at the top of the importance plot, as well as temperature and precipitation. We should take this with a grain of salt as it’s not terribly rigorous.
Now we attempt to run a bare-bones spatiotemporal cross validation
approach. Here, we make sure that each testing fold analyzed has both
present and absent points. Then we take and split the training data to
create a validation set for model tuning in order to save the testing
set datest for the end. Testing on the
datrain.test data, we get a set of most important
predictors and optimal threshold to use on datest. We
finally save the metrics to aggregate later before moving onto the next
spatiotemporal fold.
set.seed(123)
# Spatiotemporal CV on the full data: hold out spatial fold i in year j, train
# on everything outside that fold/year, tune a threshold on an inner validation
# split, then evaluate a reduced model on the held-out fold.
for(j in data$Year %>% unique() %>% sort()){
  for(i in data$fold %>% unique() %>% sort()){
    print(paste(i,j, sep="_"))
    # first split using i and j
    datrain <- data %>%
      filter(Year != j,
             fold != i)
    datest <- data %>%
      filter(Year == j,
             fold == i)
    # skip fold if there is a complete imbalance of present vs. absent CL
    if(datest$CL %>% unique() %>% length() != 2){next}
    # second split (on datrain) before testing on datest for further validation
    datrain.split <- initial_split(datrain)
    datrain.train <- training(datrain.split)
    datrain.test <- testing(datrain.split)
    # get initial model
    init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
    # calculate out of sample model performance on datrain.test
    # FIX: predict with the fold-specific init.mod, not the global full.mod.
    # full.mod was trained on data overlapping datrain.test, which leaks
    # information into these validation metrics (loops below use init.mod).
    oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response')
    auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
    best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    # Binarize validation predictions at the fold-optimal threshold.
    metrica.format.datrain.test <- data.frame(
      cbind(ifelse(datrain.test$CL==1, 1, 0),
            ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>%
      mutate_all(as.factor)
    colnames(metrica.format.datrain.test) <- c("labels","predictions")
    rownames(metrica.format.datrain.test) <- 1:dim(metrica.format.datrain.test)[1]
    # The name is reused for the confusionMatrix object from here on.
    metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>%
      confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
    # Sensitivity = recall of the positive ("1") class.
    sensitivity_out.datrain.test <- caret::recall(
      data = metrica.format.datrain.test$table,
      relevant = rownames(metrica.format.datrain.test$table)[2]
    )
    # FIX: use caret::specificity (true-negative rate); caret::precision is
    # positive predictive value, not specificity.
    specificity_out.datrain.test <- caret::specificity(
      data = metrica.format.datrain.test$table,
      negative = rownames(metrica.format.datrain.test$table)[1]
    )
    var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable
    # for retrieval later
    datrain.test.metrics <- list(
      auc = auc_out.datrain.test,
      best_threshold = best_threshold_out.datrain.test,
      confusion_matrix = metrica.format.datrain.test,
      sensitivity = sensitivity_out.datrain.test,
      specificity = specificity_out.datrain.test,
      imp_vars = var_imp_vars.datrain.test)
    # Top validation-set predictors plus the forest covariates of interest.
    edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss")
    ## second model on all of datrain
    edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
    edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
    oob.datest <- predict(edited.mod, newdata=datest, type='response')
    auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
    best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
    metrica.format.datest <- data.frame(
      cbind(ifelse(datest$CL==1, 1, 0),
            ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>%
      mutate_all(as.factor)
    colnames(metrica.format.datest) <- c("labels","predictions")
    rownames(metrica.format.datest) <- 1:dim(metrica.format.datest)[1]
    metrica.format.datest <- metrica.format.datest$predictions %>%
      confusionMatrix(positive='1', reference=metrica.format.datest$labels)
    sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table,
                                            relevant = rownames(metrica.format.datest$table)[2])
    # FIX: specificity, not precision (see above).
    specificity_out.datest <- caret::specificity(data = metrica.format.datest$table,
                                                 negative = rownames(metrica.format.datest$table)[1])
    var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
    # for retrieval later
    datest.metrics <- list(
      auc = auc_out.datest,
      best_threshold = best_threshold_out.datest,
      confusion_matrix = metrica.format.datest,
      sensitivity = sensitivity_out.datest,
      specificity = specificity_out.datest,
      imp_vars = var_imp_vars.datest)
    saveRDS(datrain.test.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datrain.test/", i, "_", j, "_metrics.rds"))
    saveRDS(datest.metrics,
            paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))
    saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models/", i, "_", j, "_edited_mod.rds"))
  }
}
For the most part, our code does not change. But first, we need to
redefine the data that we partition into datrain and
datest for each spatiotemporal fold:
set.seed(123)
# create holdout set
# Hold out a test set, stratified by spatial fold so every fold is represented
# in both partitions.
balanced.holdout.split <- initial_split(data, strata=fold)
balanced.holdout.train <- training(balanced.holdout.split)
balanced.holdout.test <- testing(balanced.holdout.split)
# Down-sample the presence rows (CL == 1) to the number of absence rows, then
# stack them on all the absences to get a 50/50 class balance.
# NOTE(review): assumes presences outnumber absences in balanced.holdout.train
# — confirm; if not, slice_sample() would draw fewer rows than requested.
balanced_data_v1 <- balanced.holdout.train %>%
filter(CL == 1) %>%
slice_sample(n = nrow(balanced.holdout.train %>% filter(CL == 0))) %>%
rbind(balanced.holdout.train %>% filter(CL == 0))
# Compare sizes of the full vs. balanced training data.
dim(data)
## [1] 26213 41
dim(balanced_data_v1)
## [1] 11048 41
Compare the dimensions of the full data and the balanced data. The balanced training data is less than half the size of the full data! The following code block uses the same structure as the one above using the full data.
resume_year = 2001
# Spatiotemporal CV on the balanced (v1) data; identical structure to the loop
# over the full data above: hold out fold i in year j, tune on an inner split,
# then evaluate a reduced model on the held-out fold and save all artifacts.
for(j in resume_year:max(balanced_data_v1$Year)){
for(i in balanced_data_v1$fold %>% unique() %>% sort()){
print(paste(i,j, sep="_"))
# first split using i and j
datrain <- balanced_data_v1 %>%
filter(Year != j,
fold != i)
datest <- balanced_data_v1 %>%
filter(Year == j,
fold == i)
# if the data is too imbalanced, then we skip to the next iteration i_j
if(datest$CL %>% unique() %>% length() == 1) {next}
# second split (on datrain) before testing on datest
datrain.split <- initial_split(datrain)
datrain.train <- training(datrain.split)
datrain.test <- testing(datrain.split)
# get initial model
init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
# calculate out of sample model performance on datrain.test
oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response')
auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize validation predictions at the fold-optimal threshold.
metrica.format.datrain.test <- data.frame(
cbind(ifelse(datrain.test$CL==1, 1, 0),
ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>%
mutate_all(as.factor)
colnames(metrica.format.datrain.test) <- c("labels","predictions")
rownames(metrica.format.datrain.test) <- 1:dim(metrica.format.datrain.test)[1]
# The name is reused for the confusionMatrix object from here on.
metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>%
confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
sensitivity_out.datrain.test <- caret::recall(
data = metrica.format.datrain.test$table,
relevant = rownames(metrica.format.datrain.test$table)[2]
)
# NOTE(review): caret::precision() computes positive predictive value, not
# specificity — the value saved below as "specificity" is actually precision;
# confirm before reporting (caret::specificity() would give the TNR).
specificity_out.datrain.test <- caret::precision(
data = metrica.format.datrain.test$table,
relevant = rownames(metrica.format.datrain.test$table)[2]
)
var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable
# for retrieval later
datrain.test.metrics <- list(
auc = auc_out.datrain.test,
best_threshold = best_threshold_out.datrain.test,
confusion_matrix = metrica.format.datrain.test,
sensitivity = sensitivity_out.datrain.test,
specificity = specificity_out.datrain.test,
imp_vars = var_imp_vars.datrain.test)
# Top validation-set predictors plus the forest covariates of interest.
edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss")
## second model on all of datrain
edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
oob.datest <- predict(edited.mod, newdata=datest, type='response')
auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
metrica.format.datest <- data.frame(
cbind(ifelse(datest$CL==1, 1, 0),
ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>%
mutate_all(as.factor)
colnames(metrica.format.datest) <- c("labels","predictions")
rownames(metrica.format.datest) <- 1:dim(metrica.format.datest)[1]
metrica.format.datest <- metrica.format.datest$predictions %>%
confusionMatrix(positive='1', reference=metrica.format.datest$labels)
sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table,
relevant = rownames(metrica.format.datest$table)[2])
# NOTE(review): same precision-vs-specificity caveat as above.
specificity_out.datest <- caret::precision(data = metrica.format.datest$table,
relevant = rownames(metrica.format.datest$table)[2])
var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
# for retrieval later
datest.metrics <- list(
auc = auc_out.datest,
best_threshold = best_threshold_out.datest,
confusion_matrix = metrica.format.datest,
sensitivity = sensitivity_out.datest,
specificity = specificity_out.datest,
imp_vars = var_imp_vars.datest)
saveRDS(datrain.test.metrics,
paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datrain.test/", i, "_", j, "_metrics.rds"))
saveRDS(datest.metrics,
paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))
saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models/", i, "_", j, "_edited_mod.rds"))
}
}
Maybe we can further choose a better resampling method since we skip a lot of iterations.
What if we go into each spatiotemporal fold \(ij\) and take \(n_{ij}\) = the number of observations with absent incidence in that spatiotemporal fold? Note that \(n_{ij} \geq 0\).
set.seed(123)
# Balance each spatiotemporal fold ij individually: sample as many presence
# rows as there are absence rows in that fold (capped by the presences
# available, so a fold is only partially balanced when presences are scarce).
# Per-fold samples are collected in a list and bound once at the end instead of
# growing a data frame with rbind() inside the loop, which copies the whole
# accumulator on every iteration (O(n^2)). The filter/sample sequence is
# unchanged, so the RNG draws — and therefore the sampled rows — are identical.
balanced_pieces <- list()
for(j in balanced.holdout.train$Year %>% unique() %>% sort()){
  for(i in balanced.holdout.train$fold %>% unique() %>% sort()){
    # get ij fold subset
    data_ij <- balanced.holdout.train %>%
      filter(Year == j,
             fold == i)
    # get subset of ij fold subset of only observations with no CL occurrence
    data_ij_absent <- data_ij %>%
      filter(CL == 0)
    # number of absences = target sample size for presences
    n_ij <- nrow(data_ij_absent)
    if(n_ij == 0){next}
    # count presence observations available in the ij fold
    n_data_ij_positive <- data_ij %>%
      filter(CL == 1) %>%
      nrow()
    if(n_data_ij_positive == 0){next}
    # cap so slice_sample() never asks for more rows than exist
    n_ij <- min(n_ij, n_data_ij_positive)
    data_ij_positive_sample <- data_ij %>%
      filter(CL == 1) %>%
      slice_sample(n=n_ij)
    # combine the sampled presences with all absences for this fold
    balanced_pieces[[length(balanced_pieces) + 1]] <- data_ij_positive_sample %>%
      rbind(data_ij_absent)
    print(paste(j, i, n_ij, nrow(data_ij_positive_sample)))
  }
}
# single bind at the end; empty data frame if every fold was skipped
balanced_data_v2 <- if (length(balanced_pieces) > 0) {
  do.call(rbind, balanced_pieces)
} else {
  data.frame()
}
## [1] "2001 1 8 8"
## [1] "2001 2 13 13"
## [1] "2001 3 117 117"
## [1] "2002 1 4 4"
## [1] "2002 2 6 6"
## [1] "2002 3 108 108"
## [1] "2003 1 3 3"
## [1] "2003 2 3 3"
## [1] "2003 3 111 111"
## [1] "2004 1 5 5"
## [1] "2004 2 7 7"
## [1] "2004 3 126 126"
## [1] "2005 1 3 3"
## [1] "2005 2 4 4"
## [1] "2005 3 100 100"
## [1] "2006 1 3 3"
## [1] "2006 2 1 1"
## [1] "2006 3 115 115"
## [1] "2007 1 33 33"
## [1] "2008 1 22 22"
## [1] "2009 1 23 23"
## [1] "2009 2 8 8"
## [1] "2009 3 135 135"
## [1] "2010 1 380 380"
## [1] "2011 1 373 373"
## [1] "2012 1 359 359"
## [1] "2013 1 351 351"
## [1] "2014 1 370 370"
## [1] "2015 1 331 331"
## [1] "2016 1 35 35"
## [1] "2016 2 8 8"
## [1] "2016 3 186 186"
## [1] "2017 1 28 28"
## [1] "2017 2 5 5"
## [1] "2017 3 182 182"
## [1] "2018 1 23 23"
## [1] "2018 2 8 8"
## [1] "2018 3 169 169"
## [1] "2019 1 28 28"
## [1] "2019 2 8 8"
## [1] "2019 3 154 154"
## [1] "2020 1 28 28"
## [1] "2020 2 7 7"
## [1] "2020 3 145 145"
Let’s save this balanced data for later. For now, we get our ideal
threshold and predictor variables from balanced_data_v1. To
get our threshold, we just take the mean of our thresholds provided by
each fold. To get our predictor variables, we count each time the
predictor variable appears in all folds.
# Accumulators for the per-fold metrics read back from disk.
threshold <- c()
imp.vars <- c()
auc <- c()
p_val <- c()
resume_year <- 2001
# Base directory where the datest metrics for each fold were saved.
metrics_dir <- "~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/"
for(j in resume_year:max(balanced_data_v1$Year)){
  for(i in balanced_data_v1$fold %>% unique() %>% sort()){
    metrics_path <- paste0(metrics_dir, i, "_", j, "_metrics.rds")
    # Folds with a missing class were skipped during training, so no file exists.
    if (!file.exists(metrics_path)) {next}
    this.metrics <- readRDS(metrics_path)
    threshold <- c(threshold, this.metrics$best_threshold[[1]])
    imp.vars <- c(imp.vars, this.metrics$imp_vars)
    auc <- c(auc, this.metrics$auc$auc[[1]])
    p_val <- c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
  }
}
# Aggregate fold-level results: average threshold/AUC/p-value across folds and
# rank the important variables by how often they appeared.
mean.threshold <- mean(threshold)
# Important variables as a factor column so count() tallies occurrences.
imp.vars.df <- data.frame(var = factor(imp.vars))
# Frequency table of important variables, most common first.
v1.imp.vars.count <- imp.vars.df %>%
  count(var, sort = TRUE)
mean.auc <- mean(auc)
mean.p_val <- mean(p_val)
# One-row-per-metric summary table for reporting.
metrics_df.balanced_v1 <- data.frame(
  metric = c("AUC", "Accuracy P-value", "Threshold"),
  value = c(mean.auc, mean.p_val, mean.threshold)
)
metrics_df.balanced_v1
## metric value
## 1 AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3 Threshold 0.8011194
This approach gives a pretty high accuracy p-value (0.53) but decent AUC value (0.75).
We can probably do better by weighting the threshold by a performance
metric like no information rate (NIR) or P-value [Acc > NIR]
(this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]).
This method by itself can be improved by giving a weight, perhaps weighting by order of importance in each fold or a performance metric. (See above.)
For now, we can carry on with this approach and see how well it performs. We choose the 10 most popular predictor variables to use in our formula and train a model on the testing data. [TODO: continue]
#### change v1.datrain/v1.datest -> balanced.holdout.train/v1.datest
set.seed(123)
# choose the first 10 predictor variables (ranked by appearance count across folds)
v1.imp_vars <- v1.imp.vars.count %>%
  select(var) %>%
  filter(row_number() <= 10) %>%
  mutate(var = as.character(var))
# get our predictor names into vector format but make sure to include the forest variables
v1.predictor_names <- c(v1.imp_vars$var, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
  unique()
# Mean CV threshold. NOTE(review): currently unused below — the holdout-ROC
# threshold v1.best_threshold_out is used instead; confirm which is intended.
v1.threshold <- metrics_df.balanced_v1 %>%
  filter(metric=='Threshold') %>%
  select(value) %>%
  as.numeric()
# make the formula based on our chosen predictors
v1.ml_formula <- as.formula(paste0("CL ~", paste(v1.predictor_names, collapse=" + ")))
# Train on the balanced holdout-train split; test on the untouched holdout-test split.
v1.datrain <- balanced.holdout.train
v1.datest <- balanced.holdout.test
# get model
v1.mod <- glm(v1.ml_formula, data=v1.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on v1.datest
v1.oob <- predict(v1.mod, newdata=v1.datest, type='response', na.action = na.pass)
v1.temp_auc_out <- pROC::roc(response = v1.datest$CL, predictor= v1.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v1.best_threshold_out <- pROC::coords(v1.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
v1.metrica.format <- data.frame(
  cbind(ifelse(v1.datest$CL==1, 1, 0),
        ifelse(v1.oob>=v1.best_threshold_out[1,1], 1, 0))) %>% # v1.best_threshold_out[1,1] -> 0.8243;
  mutate_all(as.factor)
colnames(v1.metrica.format) <- c("labels","predictions")
rownames(v1.metrica.format) <- 1:dim(v1.metrica.format)[1]
# FIX: pass a numeric 0/1 predictor, as the full-model block does, rather than
# an ordered factor — keeps the predictor type consistent across the file.
v1.auc_out <- pROC::roc(response = v1.metrica.format$labels, predictor= v1.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v1.metrica.format.confusionMatrix <- v1.metrica.format$predictions %>%
  confusionMatrix(positive='1', reference=v1.metrica.format$labels)
# Sensitivity = recall of the positive ("1") class.
v1.sensitivity_out <- caret::recall(
  data = v1.metrica.format.confusionMatrix$table,
  relevant = rownames(v1.metrica.format.confusionMatrix$table)[2]
)
# FIX: caret::specificity (true-negative rate), not caret::precision (PPV).
v1.specificity_out <- caret::specificity(
  data = v1.metrica.format.confusionMatrix$table,
  negative = rownames(v1.metrica.format.confusionMatrix$table)[1]
)
# Top-20 variable-importance plot for the final v1 model.
v1.var_imp_vars <- vip::vip(v1.mod, num_features = 20)
v1.var_imp_vars
The top variables are precipitation, elevation, and socioeconomic again.
# Partial-dependence profiles: the marginal effect of each covariate on the
# predicted response of v1.mod, one plot per predictor.
v1.partial.forest_density <- pdp::partial(v1.mod, pred.var='forest_density')
pdp::plotPartial(v1.partial.forest_density)
v1.partial.forest_fragmentation <- pdp::partial(v1.mod, pred.var='forest_fragmentation')
pdp::plotPartial(v1.partial.forest_fragmentation)
v1.partial.land_use_change <- pdp::partial(v1.mod, pred.var='land_use_change')
pdp::plotPartial(v1.partial.land_use_change)
v1.partial.edge_loss <- pdp::partial(v1.mod, pred.var='edge_loss')
pdp::plotPartial(v1.partial.edge_loss)
v1.partial.Tair_f_tavg <- pdp::partial(v1.mod, pred.var='Tair_f_tavg')
pdp::plotPartial(v1.partial.Tair_f_tavg)
v1.partial.min_Precip <- pdp::partial(v1.mod, pred.var='min_Precip')
pdp::plotPartial(v1.partial.min_Precip)
# Coefficient table for the final v1 logistic model.
summary(v1.mod)
##
## Call:
## glm(formula = v1.ml_formula, family = binomial, data = v1.datrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.2569 -0.2023 0.3611 0.6041 4.3589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.381e+01 9.227e+00 -1.496 0.134620
## min_Elevation -1.456e-03 7.708e-05 -18.883 < 2e-16 ***
## Population 4.606e-05 2.081e-06 22.130 < 2e-16 ***
## min_Precip -1.971e-02 8.266e-04 -23.843 < 2e-16 ***
## forest_density 1.051e-02 1.191e-03 8.826 < 2e-16 ***
## HNTL -1.413e-01 8.798e-03 -16.059 < 2e-16 ***
## Wind_f_tavg -1.921e-01 3.164e-02 -6.070 1.28e-09 ***
## max_Elevation 2.279e-05 5.490e-05 0.415 0.678062
## LST_Night 6.584e-02 1.592e-02 4.137 3.52e-05 ***
## long -3.964e-07 3.462e-08 -11.450 < 2e-16 ***
## Year 6.149e-03 4.617e-03 1.332 0.182915
## forest_fragmentation 6.327e-07 6.725e-07 0.941 0.346839
## land_use_change 7.694e-03 2.953e-02 0.261 0.794467
## edge_loss 5.090e-07 1.470e-07 3.461 0.000538 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23350 on 19657 degrees of freedom
## Residual deviance: 14279 on 19644 degrees of freedom
## AIC: 14307
##
## Number of Fisher Scoring iterations: 7
This does not seem to give much deeper insight. We investigate the incorrectly classified observations:
# Attach the thresholded predictions to the holdout test rows and flag which
# observations were classified correctly, with the key columns moved up front.
v1.incorrect <- balanced.holdout.test %>%
  mutate(.pred = v1.metrica.format$predictions,
         correct = .pred == CL) %>%
  select(Code, Name, Country, Year, CL, correct, everything())
# there is a 1787 to 4768 split between true CL absent vs. present
v1.incorrect %>% summary()
## Code Name Country Year CL
## Min. : 10101 Length:6555 Brazil :4272 Min. :2001 0:1787
## 1st Qu.: 120608 Class :character Colombia: 317 1st Qu.:2008 1:4768
## Median :1503457 Mode :character Peru :1966 Median :2012
## Mean :1783845 Mean :2012
## 3rd Qu.:2107506 3rd Qu.:2015
## Max. :5300108 Max. :2020
##
## correct NDVI EVI LST_Day
## Mode :logical Min. :0.1767 Min. :0.1016 Min. :11.77
## FALSE:1198 1st Qu.:0.5453 1st Qu.:0.3354 1st Qu.:25.95
## TRUE :5357 Median :0.6172 Median :0.3995 Median :28.82
## Mean :0.5928 Mean :0.3809 Mean :27.89
## 3rd Qu.:0.6722 3rd Qu.:0.4507 3rd Qu.:31.27
## Max. :0.8208 Max. :0.5459 Max. :36.71
##
## min_LST_Day max_LST_Day LST_Night min_LST_Night
## Min. :-8.793 Min. :18.22 Min. :-5.268 Min. :-37.750
## 1st Qu.:17.961 1st Qu.:31.03 1st Qu.:15.358 1st Qu.: 5.081
## Median :23.644 Median :34.80 Median :20.881 Median : 15.916
## Mean :20.851 Mean :34.77 Mean :17.029 Mean : 10.702
## 3rd Qu.:25.447 3rd Qu.:38.58 3rd Qu.:21.779 3rd Qu.: 18.420
## Max. :31.086 Max. :48.22 Max. :24.953 Max. : 22.886
##
## max_LST_Night HNTL Precip Evap_tavg
## Min. :-1.856 Min. : 0.0000 Min. : 184 Min. :1.718e-06
## 1st Qu.:19.443 1st Qu.: 0.1043 1st Qu.:1179 1st Qu.:3.313e-05
## Median :23.256 Median : 0.5435 Median :1636 Median :4.163e-05
## Mean :19.940 Mean : 1.9367 Mean :1702 Mean :3.879e-05
## 3rd Qu.:24.188 3rd Qu.: 2.3949 3rd Qu.:2102 3rd Qu.:4.581e-05
## Max. :28.188 Max. :60.8290 Max. :4831 Max. :6.252e-05
##
## Qair_f_tavg SoilMoi00_10cm_tavg Wind_f_tavg Tair_f_tavg
## Min. :0.004301 Min. :0.1988 Min. :1.412 Min. :273.6
## 1st Qu.:0.011649 1st Qu.:0.3041 1st Qu.:2.354 1st Qu.:294.6
## Median :0.014090 Median :0.3297 Median :3.111 Median :298.9
## Mean :0.013457 Mean :0.3339 Mean :3.169 Mean :295.3
## 3rd Qu.:0.016298 3rd Qu.:0.3618 3rd Qu.:3.874 3rd Qu.:299.8
## Max. :0.018958 Max. :0.4281 Max. :6.758 Max. :302.2
##
## Population min_Precip max_Precip min_Precip_Month
## Min. : 111.6 Min. : 0.0000 Min. : 31.54 Min. : 1.000
## 1st Qu.: 3875.6 1st Qu.: 0.3931 1st Qu.:243.89 1st Qu.: 6.000
## Median : 8904.1 Median : 6.9627 Median :331.79 Median : 7.000
## Mean : 24261.9 Mean : 19.8774 Mean :329.78 Mean : 7.245
## 3rd Qu.: 19207.4 3rd Qu.: 22.2811 3rd Qu.:403.59 3rd Qu.: 8.000
## Max. :2791764.8 Max. :281.0092 Max. :949.69 Max. :12.000
##
## max_Precip_Month forest_density SWChange_abs SWChange_norm
## Min. : 1.000 Min. : 0.00 Min. :-128.0000 Min. :-128.000
## 1st Qu.: 2.000 1st Qu.: 7.28 1st Qu.: -3.6756 1st Qu.: 1.579
## Median : 3.000 Median :23.04 Median : 0.8671 Median : 15.067
## Mean : 4.149 Mean :33.27 Mean : 2.6775 Mean : 21.892
## 3rd Qu.: 5.000 3rd Qu.:57.09 3rd Qu.: 8.1409 3rd Qu.: 39.375
## Max. :12.000 Max. :99.65 Max. : 84.6450 Max. : 100.000
## NA's :85 NA's :85
## SWOccurrence SWRecurrence SWSeasonality max_Elevation
## Min. : 0.00 Min. : 24.00 Min. : 1.000 Min. : 29
## 1st Qu.:41.24 1st Qu.: 77.28 1st Qu.: 5.324 1st Qu.: 349
## Median :60.27 Median : 85.10 Median : 7.271 Median : 646
## Mean :58.09 Mean : 83.49 Mean : 7.172 Mean :1528
## 3rd Qu.:76.43 3rd Qu.: 91.27 3rd Qu.: 9.103 3rd Qu.:2763
## Max. :98.41 Max. :100.00 Max. :11.908 Max. :6544
## NA's :85 NA's :85 NA's :112
## mean_Elevation min_Elevation var_Elevation fold
## Min. : 3.383 Min. : -3.0 Min. : 10.9 Min. :1.000
## 1st Qu.: 170.384 1st Qu.: 82.0 1st Qu.: 901.2 1st Qu.:1.000
## Median : 320.461 Median : 199.0 Median : 4683.5 Median :3.000
## Mean :1006.483 Mean : 631.1 Mean : 89736.7 Mean :2.126
## 3rd Qu.:1125.551 3rd Qu.: 544.0 3rd Qu.: 69556.4 3rd Qu.:3.000
## Max. :4814.186 Max. :4350.0 Max. :2298170.4 Max. :3.000
##
## forest_fragmentation land_use_change edge_loss long
## Min. : 0.0 Min. :-7.70056 Min. :-2677290 Min. :-8863907
## 1st Qu.: 686.8 1st Qu.:-0.25048 1st Qu.: -13590 1st Qu.:-8226324
## Median : 1724.1 Median :-0.01798 Median : 480 Median :-6295076
## Mean : 14365.0 Mean :-0.17601 Mean : 27872 Mean :-6679746
## 3rd Qu.: 5751.9 3rd Qu.: 0.03911 3rd Qu.: 29940 3rd Qu.:-5380194
## Max. :750054.3 Max. : 6.55408 Max. : 4414320 Max. :-4851417
##
## lat .pred
## Min. :-2124011 0:2057
## 1st Qu.:-1435452 1:4498
## Median : -952690
## Mean : -934598
## 3rd Qu.: -506889
## Max. : 649268
##
# plots
# Reshape to one row per (observation, important variable), with Population on
# a log scale so its long tail doesn't dominate the facets.
# FIX: wrap the external character vector in all_of() — passing v1.imp_vars$var
# bare to select() relies on deprecated tidyselect behavior.
v1.incorrect.important_vars <- v1.incorrect %>%
  select(Code, Name, Year, CL, correct, all_of(v1.imp_vars$var)) %>%
  mutate(log_Population = log(Population)) %>%
  select(-Population) %>%
  pivot_longer(cols = c(min_Elevation:log_Population))
# Boxplots of each important variable, correct vs. incorrect classifications.
ggplot(v1.incorrect.important_vars) +
  geom_boxplot(aes(x = correct,
                   y = value)) +
  facet_wrap(facets='name', scales='free')
# Density overlays of the same comparison.
ggplot(v1.incorrect.important_vars) +
  geom_density(aes(x = value,
                   col = correct)) +
  facet_wrap(facets='name', scales='free')
The main takeaways here are that incorrectly identified data seem to have lower forest density, lower population, and higher wind speeds than correctly identified data.
We go on now to use the balanced_data_v2 data set.
set.seed(123)
current_j = 2001
# Spatiotemporal CV on the per-fold balanced (v2) data; same structure as the
# earlier loops, with artifacts saved under the metrics.v2/models.v2 paths.
for(j in current_j:max(balanced_data_v2$Year)){
for(i in balanced_data_v2$fold %>% unique() %>% sort()){
print(paste(i,j, sep="_"))
# first split using i and j
datrain <- balanced_data_v2 %>%
filter(Year != j,
fold != i)
datest <- balanced_data_v2 %>%
filter(Year == j,
fold == i)
# if the data is too imbalanced, then we skip to the next iteration i_j
if(datest$CL %>% unique() %>% length() == 1) {next}
# also skip folds that ended up empty after the v2 balancing
if(nrow(datest) == 0) {next}
# second split (on datrain) before testing on datest
datrain.split <- initial_split(datrain)
datrain.train <- training(datrain.split)
datrain.test <- testing(datrain.split)
# get initial model
init.mod <- glm(full.ml_formula, data=datrain.train, family=binomial)
# calculate out of sample model performance on datrain.test
oob.datrain.test <- predict(init.mod, newdata=datrain.test, type='response')
auc_out.datrain.test <- pROC::roc(response = datrain.test$CL, predictor= oob.datrain.test, levels=c(0,1), auc = TRUE)
best_threshold_out.datrain.test <- pROC::coords(auc_out.datrain.test, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize validation predictions at the fold-optimal threshold.
metrica.format.datrain.test <- data.frame(
cbind(ifelse(datrain.test$CL==1, 1, 0),
ifelse(oob.datrain.test>=best_threshold_out.datrain.test[1,1], 1, 0))) %>%
mutate_all(as.factor)
colnames(metrica.format.datrain.test) <- c("labels","predictions")
rownames(metrica.format.datrain.test) <- 1:dim(metrica.format.datrain.test)[1]
# The name is reused for the confusionMatrix object from here on.
metrica.format.datrain.test <- metrica.format.datrain.test$predictions %>%
confusionMatrix(positive='1', reference=metrica.format.datrain.test$labels)
sensitivity_out.datrain.test <- caret::recall(
data = metrica.format.datrain.test$table,
relevant = rownames(metrica.format.datrain.test$table)[2]
)
# NOTE(review): caret::precision() computes positive predictive value, not
# specificity — the value saved below as "specificity" is actually precision;
# confirm before reporting (caret::specificity() would give the TNR).
specificity_out.datrain.test <- caret::precision(
data = metrica.format.datrain.test$table,
relevant = rownames(metrica.format.datrain.test$table)[2]
)
var_imp_vars.datrain.test <- vip::vip(init.mod)$data$Variable # picks top ten features
# for retrieval later
datrain.test.metrics <- list(
auc = auc_out.datrain.test,
best_threshold = best_threshold_out.datrain.test,
confusion_matrix = metrica.format.datrain.test,
sensitivity = sensitivity_out.datrain.test,
specificity = specificity_out.datrain.test,
imp_vars = var_imp_vars.datrain.test
)
# Top validation-set predictors plus the forest covariates of interest.
edited.predictor_names <- c(datrain.test.metrics$imp_vars, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss")
## second model on all of datrain
edited.ml_formula <- as.formula(paste0("CL ~ ", paste(edited.predictor_names, collapse="+")))
edited.mod <- glm(edited.ml_formula, data=datrain, family=binomial)
oob.datest <- predict(edited.mod, newdata=datest, type='response')
auc_out.datest <- pROC::roc(response = datest$CL, predictor= oob.datest, levels=c(0,1), auc = TRUE)
best_threshold_out.datest <- pROC::coords(auc_out.datest, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
metrica.format.datest <- data.frame(
cbind(ifelse(datest$CL==1, 1, 0),
ifelse(oob.datest>=best_threshold_out.datest[1,1], 1, 0))) %>%
mutate_all(as.factor)
colnames(metrica.format.datest) <- c("labels","predictions")
rownames(metrica.format.datest) <- 1:dim(metrica.format.datest)[1]
metrica.format.datest <- metrica.format.datest$predictions %>%
confusionMatrix(positive='1', reference=metrica.format.datest$labels)
sensitivity_out.datest <- caret::recall(data = metrica.format.datest$table,
relevant = rownames(metrica.format.datest$table)[2])
# NOTE(review): same precision-vs-specificity caveat as above.
specificity_out.datest <- caret::precision(data = metrica.format.datest$table,
relevant = rownames(metrica.format.datest$table)[2])
var_imp_vars.datest <- vip::vip(edited.mod)$data$Variable
# for retrieval later
datest.metrics <- list(
auc = auc_out.datest,
best_threshold = best_threshold_out.datest,
confusion_matrix = metrica.format.datest,
sensitivity = sensitivity_out.datest,
specificity = specificity_out.datest,
imp_vars = var_imp_vars.datest)
saveRDS(datrain.test.metrics,
paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datrain.test/", i, "_", j, "_metrics.rds"))
saveRDS(datest.metrics,
paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))
saveRDS(edited.mod, paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/models.v2/", i, "_", j, "_edited_mod.rds"))
}
}
Using the above models, let’s find a good final threshold and
variables to include in the final model, like we did for
balanced_data_v1.
# Aggregate per-fold test metrics (v2) across every spatiotemporal fold on disk.
# create empty vectors to be filled
threshold <- c()
imp.vars <- c()
auc <- c()
p_val <- c()
for(j in resume_year:max(balanced_data_v2$Year)){
for(i in balanced_data_v2$fold %>% unique() %>% sort()){
# make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
if (!file.exists(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))) {next}
this.metrics <- readRDS(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))
threshold <- c(threshold, this.metrics$best_threshold[[1]])
# imp_vars is a character vector of top features; accumulate across folds.
imp.vars <- c(imp.vars, this.metrics$imp_vars)
auc <- c(auc, this.metrics$auc$auc[[1]])
p_val <-c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
}
}
# get mean threshold
mean.threshold <- mean(threshold)
# turn imp.vars into data frame
imp.vars.df <- data.frame(var = factor(imp.vars))
# get count of each variable to rank general importance to apply to main data
v2.imp.vars.count <- imp.vars.df %>%
count(var) %>%
arrange(desc(n))
# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)
# Summary table of fold-averaged AUC, accuracy p-value, and threshold for v2.
metrics_df.balanced_v2 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v2
## metric value
## 1 AUC 0.7526243
## 2 Accuracy P-value 0.1034658
## 3 Threshold 0.2897455
We compare this to the results of
metrics_df.balanced_v1:
metrics_df.balanced_v1
## metric value
## 1 AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3 Threshold 0.8011194
# change meaning of v2.datrain and v2.datest to balanced.holdout.train and balanced.holdout.test
set.seed(123)
# Take the 10 most frequently selected variables across folds.
v2.imp_vars <- v2.imp.vars.count %>%
select(var) %>%
filter(row_number() <= 10) %>%
mutate(var = as.character(var))
# Add the four forest variables of interest; unique() guards against duplicates.
v2.predictor_names <- c(v2.imp_vars$var, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
unique()
# NOTE(review): v2.threshold is computed but not used below — the ROC-optimal
# threshold from the holdout test set is used instead; confirm which was intended.
v2.threshold <- metrics_df.balanced_v2 %>% filter(metric=='Threshold') %>% select(value) %>% as.numeric()
v2.ml_formula <- as.formula(paste0("CL ~", paste(v2.predictor_names, collapse=" + ")))
# get testing and training split
# NOTE(review): this split is unused — the commented lines below were replaced by
# the shared balanced holdout split (kept so the RNG state matches earlier runs).
split <- initial_split(data, strata = CL)
# v2.datrain <- training(split)
# v2.datest <- testing(split)
v2.datrain <- balanced.holdout.train
v2.datest <- balanced.holdout.test
# get model
v2.mod <- glm(v2.ml_formula, data=v2.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on v2.datest
v2.oob <- predict(v2.mod, newdata=v2.datest, type='response', na.action = na.pass)
v2.temp_auc_out <- pROC::roc(response = v2.datest$CL, predictor= v2.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v2.best_threshold_out <- pROC::coords(v2.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Threshold the probabilities at the ROC-optimal cutoff and pair with true labels.
v2.metrica.format <- data.frame(
cbind(ifelse(v2.datest$CL==1, 1, 0),
ifelse(v2.oob>=v2.best_threshold_out[1,1], 1, 0))) %>% # 0.821
mutate_all(as.factor)
colnames(v2.metrica.format) <- c("labels","predictions")
rownames(v2.metrica.format) <- 1:dim(v2.metrica.format)[1]
# AUC of the *binarized* predictions (equals balanced accuracy for 0/1 predictors).
v2.auc_out <- pROC::roc(response = v2.metrica.format$labels,
predictor = v2.metrica.format$predictions %>% as.numeric() - 1,
levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
v2.metrica.format.confusionMatrix <- v2.metrica.format$predictions %>%
confusionMatrix(positive='1', reference=v2.metrica.format$labels)
v2.sensitivity_out <- caret::recall(data = v2.metrica.format.confusionMatrix$table,
relevant = rownames(v2.metrica.format.confusionMatrix$table)[2])
# NOTE(review): caret::precision is PPV, not specificity — name is misleading.
v2.specificity_out <- caret::precision(data = v2.metrica.format.confusionMatrix$table,
relevant = rownames(v2.metrica.format.confusionMatrix$table)[2])
v2.var_imp_vars <- vip::vip(v2.mod, num_features=20)
v2.var_imp_vars
v2.incorrect %>% summary()
## Code Name Country Year CL
## Min. : 10101 Length:6555 Brazil :4272 Min. :2001 0:1787
## 1st Qu.: 120608 Class :character Colombia: 317 1st Qu.:2008 1:4768
## Median :1503457 Mode :character Peru :1966 Median :2012
## Mean :1783845 Mean :2012
## 3rd Qu.:2107506 3rd Qu.:2015
## Max. :5300108 Max. :2020
##
## correct NDVI EVI LST_Day
## Mode :logical Min. :0.1767 Min. :0.1016 Min. :11.77
## FALSE:1316 1st Qu.:0.5453 1st Qu.:0.3354 1st Qu.:25.95
## TRUE :5239 Median :0.6172 Median :0.3995 Median :28.82
## Mean :0.5928 Mean :0.3809 Mean :27.89
## 3rd Qu.:0.6722 3rd Qu.:0.4507 3rd Qu.:31.27
## Max. :0.8208 Max. :0.5459 Max. :36.71
##
## min_LST_Day max_LST_Day LST_Night min_LST_Night
## Min. :-8.793 Min. :18.22 Min. :-5.268 Min. :-37.750
## 1st Qu.:17.961 1st Qu.:31.03 1st Qu.:15.358 1st Qu.: 5.081
## Median :23.644 Median :34.80 Median :20.881 Median : 15.916
## Mean :20.851 Mean :34.77 Mean :17.029 Mean : 10.702
## 3rd Qu.:25.447 3rd Qu.:38.58 3rd Qu.:21.779 3rd Qu.: 18.420
## Max. :31.086 Max. :48.22 Max. :24.953 Max. : 22.886
##
## max_LST_Night HNTL Precip Evap_tavg
## Min. :-1.856 Min. : 0.0000 Min. : 184 Min. :1.718e-06
## 1st Qu.:19.443 1st Qu.: 0.1043 1st Qu.:1179 1st Qu.:3.313e-05
## Median :23.256 Median : 0.5435 Median :1636 Median :4.163e-05
## Mean :19.940 Mean : 1.9367 Mean :1702 Mean :3.879e-05
## 3rd Qu.:24.188 3rd Qu.: 2.3949 3rd Qu.:2102 3rd Qu.:4.581e-05
## Max. :28.188 Max. :60.8290 Max. :4831 Max. :6.252e-05
##
## Qair_f_tavg SoilMoi00_10cm_tavg Wind_f_tavg Tair_f_tavg
## Min. :0.004301 Min. :0.1988 Min. :1.412 Min. :273.6
## 1st Qu.:0.011649 1st Qu.:0.3041 1st Qu.:2.354 1st Qu.:294.6
## Median :0.014090 Median :0.3297 Median :3.111 Median :298.9
## Mean :0.013457 Mean :0.3339 Mean :3.169 Mean :295.3
## 3rd Qu.:0.016298 3rd Qu.:0.3618 3rd Qu.:3.874 3rd Qu.:299.8
## Max. :0.018958 Max. :0.4281 Max. :6.758 Max. :302.2
##
## Population min_Precip max_Precip min_Precip_Month
## Min. : 111.6 Min. : 0.0000 Min. : 31.54 Min. : 1.000
## 1st Qu.: 3875.6 1st Qu.: 0.3931 1st Qu.:243.89 1st Qu.: 6.000
## Median : 8904.1 Median : 6.9627 Median :331.79 Median : 7.000
## Mean : 24261.9 Mean : 19.8774 Mean :329.78 Mean : 7.245
## 3rd Qu.: 19207.4 3rd Qu.: 22.2811 3rd Qu.:403.59 3rd Qu.: 8.000
## Max. :2791764.8 Max. :281.0092 Max. :949.69 Max. :12.000
##
## max_Precip_Month forest_density SWChange_abs SWChange_norm
## Min. : 1.000 Min. : 0.00 Min. :-128.0000 Min. :-128.000
## 1st Qu.: 2.000 1st Qu.: 7.28 1st Qu.: -3.6756 1st Qu.: 1.579
## Median : 3.000 Median :23.04 Median : 0.8671 Median : 15.067
## Mean : 4.149 Mean :33.27 Mean : 2.6775 Mean : 21.892
## 3rd Qu.: 5.000 3rd Qu.:57.09 3rd Qu.: 8.1409 3rd Qu.: 39.375
## Max. :12.000 Max. :99.65 Max. : 84.6450 Max. : 100.000
## NA's :85 NA's :85
## SWOccurrence SWRecurrence SWSeasonality max_Elevation
## Min. : 0.00 Min. : 24.00 Min. : 1.000 Min. : 29
## 1st Qu.:41.24 1st Qu.: 77.28 1st Qu.: 5.324 1st Qu.: 349
## Median :60.27 Median : 85.10 Median : 7.271 Median : 646
## Mean :58.09 Mean : 83.49 Mean : 7.172 Mean :1528
## 3rd Qu.:76.43 3rd Qu.: 91.27 3rd Qu.: 9.103 3rd Qu.:2763
## Max. :98.41 Max. :100.00 Max. :11.908 Max. :6544
## NA's :85 NA's :85 NA's :112
## mean_Elevation min_Elevation var_Elevation fold
## Min. : 3.383 Min. : -3.0 Min. : 10.9 Min. :1.000
## 1st Qu.: 170.384 1st Qu.: 82.0 1st Qu.: 901.2 1st Qu.:1.000
## Median : 320.461 Median : 199.0 Median : 4683.5 Median :3.000
## Mean :1006.483 Mean : 631.1 Mean : 89736.7 Mean :2.126
## 3rd Qu.:1125.551 3rd Qu.: 544.0 3rd Qu.: 69556.4 3rd Qu.:3.000
## Max. :4814.186 Max. :4350.0 Max. :2298170.4 Max. :3.000
##
## forest_fragmentation land_use_change edge_loss long
## Min. : 0.0 Min. :-7.70056 Min. :-2677290 Min. :-8863907
## 1st Qu.: 686.8 1st Qu.:-0.25048 1st Qu.: -13590 1st Qu.:-8226324
## Median : 1724.1 Median :-0.01798 Median : 480 Median :-6295076
## Mean : 14365.0 Mean :-0.17601 Mean : 27872 Mean :-6679746
## 3rd Qu.: 5751.9 3rd Qu.: 0.03911 3rd Qu.: 29940 3rd Qu.:-5380194
## Max. :750054.3 Max. : 6.55408 Max. : 4414320 Max. :-4851417
##
## lat .pred
## Min. :-2124011 0:2333
## 1st Qu.:-1435452 1:4222
## Median : -952690
## Mean : -934598
## 3rd Qu.: -506889
## Max. : 649268
##
# there is a 641 to 959 split between true CL absent vs. present
# NOTE(review): the counts above look stale relative to the summary printed
# earlier (correct FALSE:1316 / TRUE:5239) — verify before citing them.
# plots
# Reshape the important variables to long format so we can facet per variable
# and compare distributions for correctly vs. incorrectly classified rows.
v2.incorrect.important_vars <- v2.incorrect %>%
# all_of() makes the external-vector selection explicit (bare vectors in
# select() are deprecated tidyselect usage); selection itself is unchanged.
select(Code, Name, Year, CL, correct, all_of(v2.imp_vars$var)) %>%
# Population is heavily right-skewed; log it for readable plots.
mutate(log_Population = log(Population)) %>%
select(-Population) %>%
pivot_longer(cols = min_Elevation:log_Population)
# Boxplots: value distribution by classification correctness, one facet per variable.
ggplot(v2.incorrect.important_vars) +
geom_boxplot(aes(x = correct,
y = value)) +
facet_wrap(facets='name', scales='free')
# Density curves: same comparison, overlaid by correctness.
ggplot(v2.incorrect.important_vars) +
geom_density(aes(x = value,
col = correct)) +
facet_wrap(facets='name', scales='free')
# Display v1 holdout confusion matrix for side-by-side comparison with v2.
v1.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1323 734
## 1 464 4034
##
## Accuracy : 0.8172
## 95% CI : (0.8077, 0.8265)
## No Information Rate : 0.7274
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.56
##
## Mcnemar's Test P-Value : 7.735e-15
##
## Sensitivity : 0.8461
## Specificity : 0.7403
## Pos Pred Value : 0.8968
## Neg Pred Value : 0.6432
## Prevalence : 0.7274
## Detection Rate : 0.6154
## Detection Prevalence : 0.6862
## Balanced Accuracy : 0.7932
##
## 'Positive' Class : 1
##
# ROC curve and AUC of the binarized v1 predictions.
v1.auc_out %>% plot()
v1.auc_out$auc
## Area under the curve: 0.7932
# Display v2 holdout confusion matrix.
v2.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1402 931
## 1 385 3837
##
## Accuracy : 0.7992
## 95% CI : (0.7893, 0.8089)
## No Information Rate : 0.7274
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5379
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8047
## Specificity : 0.7846
## Pos Pred Value : 0.9088
## Neg Pred Value : 0.6009
## Prevalence : 0.7274
## Detection Rate : 0.5854
## Detection Prevalence : 0.6441
## Balanced Accuracy : 0.7946
##
## 'Positive' Class : 1
##
# ROC curve and AUC of the binarized v2 predictions.
v2.auc_out %>% plot()
v2.auc_out$auc
## Area under the curve: 0.7946
Now we return to a weighted approach for choosing a threshold and
predictor variables, weighting each fold's top variables by that
fold's model AUC and accuracy p-value
(this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]).
Let’s try it on the metrics produced by
balanced_data_v1
For this approach, we get two sets of the top ranked variables based on the AUC and accuracy p-value of their respective models then test them separately.
# Re-read the v1 per-fold metrics, this time pairing every important variable
# with its fold's AUC and accuracy p-value so variables can be weight-ranked.
# create empty vectors to be filled
threshold <- c()
# NOTE(review): initialized with a "" placeholder row (dropped later via
# filter(row_number() > 1)) — consider building as a list and bind_rows() instead.
imp.vars <- data.frame(variable = "", auc = "", p_val = "")
auc <- c()
p_val <- c()
for(j in resume_year:max(balanced_data_v1$Year)){
for(i in balanced_data_v1$fold %>% unique() %>% sort()){
# make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
if (!file.exists(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))) {next}
this.metrics <- readRDS(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics/datest/", i, "_", j, "_metrics.rds"))
threshold <- c(threshold, this.metrics$best_threshold[[1]])
# One row per important variable, all carrying this fold's AUC and p-value.
imp.vars <- imp.vars %>%
rbind(data.frame(variable = this.metrics$imp_vars, auc = this.metrics$auc$auc[[1]], p_val = this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]))
auc <- c(auc, this.metrics$auc$auc[[1]])
p_val <-c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
}
}
# Fold-averaged threshold across all v1 spatiotemporal folds.
# get mean threshold
mean.threshold <- mean(threshold)
# turn imp.vars into data frame
# BUG FIX: imp.vars is a data frame here (unlike the earlier unweighted pass,
# where it was a character vector), so factor(imp.vars) tried to factor a whole
# data frame — producing the "cannot xtfrm data frames" warning and garbage
# counts. Use the `variable` column and drop the "" placeholder row first.
imp.vars.df <- imp.vars %>%
filter(row_number() > 1) %>%
transmute(var = factor(variable))
# get count of each variable to rank general importance to apply to main data
v1.imp.vars.count <- imp.vars.df %>%
count(var) %>%
arrange(desc(n))
# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)
# Per-variable mean AUC and mean accuracy p-value over the folds that
# selected it; drop the "" placeholder row before converting to numeric.
imp.vars.df.ranked <- imp.vars %>%
filter(row_number() > 1) %>%
mutate(auc = as.numeric(auc),
p_val = as.numeric(p_val)) %>%
group_by(variable) %>%
summarise(mean.auc = mean(auc),
mean.p_val = mean(p_val))
# get auc vars (higher mean AUC = better)
weighted_v1.auc.vars <- imp.vars.df.ranked %>%
arrange(desc(mean.auc))
# get p-val vars (lower accuracy p-value = better)
weighted_v1.p_val.vars <- imp.vars.df.ranked %>%
arrange((mean.p_val))
metrics_df.balanced_v1 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v1
## metric value
## 1 AUC 0.7523907
## 2 Accuracy P-value 0.5261833
## 3 Threshold 0.8011194
# Select the top 10 for each set
weighted_v1.auc.imp_vars <- weighted_v1.auc.vars %>%
filter(row_number() <= 10)
weighted_v1.p_val.imp_vars <- weighted_v1.p_val.vars %>%
filter(row_number() <= 10)
# weighted_v1.datrain/datest -> holdout.train/test
set.seed(123)
## auc variables first ##
# Top-10 AUC-weighted variables plus the four forest variables of interest.
weighted_v1.auc.predictor_names <- c(weighted_v1.auc.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
unique()
weighted_v1.auc.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v1.auc.predictor_names, collapse=" + ")))
# get testing and training split
# split <- initial_split(data, strata = CL)
# weighted_v1.datrain <- training(split)
# weighted_v1.datest <- testing(split)
# Use the shared balanced holdout split so all approaches are compared on the same data.
weighted_v1.datrain <- balanced.holdout.train
weighted_v1.datest <- balanced.holdout.test
# get model
weighted_v1.auc.mod <- glm(weighted_v1.auc.ml_formula, data=weighted_v1.datrain, family=binomial)
# calculate out of sample model performance on v1.datest
weighted_v1.auc.oob <- predict(weighted_v1.auc.mod, newdata=weighted_v1.datest, type='response', na.action = na.pass)
weighted_v1.auc.temp_auc_out <- pROC::roc(response = weighted_v1.datest$CL, predictor= weighted_v1.auc.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.auc.best_threshold_out <- pROC::coords(weighted_v1.auc.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize at the ROC-optimal threshold and pair with true labels.
weighted_v1.auc.metrica.format <- data.frame(
cbind(ifelse(weighted_v1.datest$CL==1, 1, 0),
ifelse(weighted_v1.auc.oob>=weighted_v1.auc.best_threshold_out[1,1], 1, 0))) %>% # 0.821
mutate_all(as.factor)
colnames(weighted_v1.auc.metrica.format) <- c("labels","predictions")
rownames(weighted_v1.auc.metrica.format) <- 1:dim(weighted_v1.auc.metrica.format)[1]
weighted_v1.auc.auc_out <- pROC::roc(response = weighted_v1.auc.metrica.format$labels, predictor= weighted_v1.auc.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.auc.metrica.format.confusionMatrix <- weighted_v1.auc.metrica.format$predictions %>%
confusionMatrix(positive='1', reference=weighted_v1.auc.metrica.format$labels)
weighted_v1.auc.sensitivity_out <- caret::recall(data = weighted_v1.auc.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v1.auc.metrica.format.confusionMatrix$table)[2])
# NOTE(review): caret::precision is PPV, not specificity — name is misleading.
weighted_v1.auc.specificity_out <- caret::precision(data = weighted_v1.auc.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v1.auc.metrica.format.confusionMatrix$table)[2])
weighted_v1.auc.var_imp_vars <- vip::vip(weighted_v1.auc.mod)
set.seed(123)
## p_val variables ## (comment fixed: this section uses the p-value-ranked set)
# Top-10 p-value-weighted variables plus the four forest variables of interest.
weighted_v1.p_val.predictor_names <- c(weighted_v1.p_val.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
unique()
weighted_v1.p_val.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v1.p_val.predictor_names, collapse=" + ")))
# get testing and training split
# split <- initial_split(data, strata = CL)
# weighted_v1.datrain <- training(split)
# weighted_v1.datest <- testing(split)
# BUG FIX: was `weighted_1.datrain <- ...` (missing "v"), which created a stray
# variable and silently left weighted_v1.datrain at its previous value. Harmless
# here only because the earlier section assigned the same holdout split.
weighted_v1.datrain <- balanced.holdout.train
weighted_v1.datest <- balanced.holdout.test
# get model
weighted_v1.p_val.mod <- glm(weighted_v1.p_val.ml_formula, data=weighted_v1.datrain, family=binomial)
# calculate out of sample model performance on weighted_v1.datest
weighted_v1.p_val.oob <- predict(weighted_v1.p_val.mod, newdata=weighted_v1.datest, type='response', na.action = na.pass)
weighted_v1.p_val.temp_auc_out <- pROC::roc(response = weighted_v1.datest$CL, predictor= weighted_v1.p_val.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.p_val.best_threshold_out <- pROC::coords(weighted_v1.p_val.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize at the ROC-optimal threshold and pair with true labels.
weighted_v1.p_val.metrica.format <- data.frame(
cbind(ifelse(weighted_v1.datest$CL==1, 1, 0),
ifelse(weighted_v1.p_val.oob>=weighted_v1.p_val.best_threshold_out[1,1], 1, 0))) %>% # 0.821
mutate_all(as.factor)
colnames(weighted_v1.p_val.metrica.format) <- c("labels","predictions")
rownames(weighted_v1.p_val.metrica.format) <- 1:dim(weighted_v1.p_val.metrica.format)[1]
weighted_v1.p_val.auc_out <- pROC::roc(response = weighted_v1.p_val.metrica.format$labels, predictor= weighted_v1.p_val.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v1.p_val.metrica.format.confusionMatrix <- weighted_v1.p_val.metrica.format$predictions %>%
confusionMatrix(positive='1', reference=weighted_v1.p_val.metrica.format$labels)
weighted_v1.p_val.sensitivity_out <- caret::recall(data = weighted_v1.p_val.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v1.p_val.metrica.format.confusionMatrix$table)[2])
# NOTE(review): caret::precision is PPV, not specificity — name is misleading.
weighted_v1.p_val.specificity_out <- caret::precision(data = weighted_v1.p_val.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v1.p_val.metrica.format.confusionMatrix$table)[2])
weighted_v1.p_val.var_imp_vars <- vip::vip(weighted_v1.p_val.mod)
Both models severely underperform when classifying absent points.
# Same weighted aggregation as for v1, now over the v2 (metrics.v2) fold files.
# create empty vectors to be filled
threshold <- c()
# NOTE(review): "" placeholder row, dropped later via filter(row_number() > 1).
imp.vars <- data.frame(variable = "", auc = "", p_val = "")
auc <- c()
p_val <- c()
for(j in resume_year:max(balanced_data_v2$Year)){
for(i in balanced_data_v2$fold %>% unique() %>% sort()){
# make sure file exists (we excluded several folds in preliminary analysis due to a class being absent)
if (!file.exists(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))) {next}
this.metrics <- readRDS(paste0("~/peregrine_amazon/Restructured021623/binary_classification/logistic_regression/model_data/st/metrics.v2/datest/", i, "_", j, "_metrics.rds"))
threshold <- c(threshold, this.metrics$best_threshold[[1]])
# One row per important variable, all carrying this fold's AUC and p-value.
imp.vars <- imp.vars %>%
rbind(data.frame(variable = this.metrics$imp_vars, auc = this.metrics$auc$auc[[1]], p_val = this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]]))
auc <- c(auc, this.metrics$auc$auc[[1]])
p_val <-c(p_val, this.metrics$confusion_matrix$overall["AccuracyPValue"][[1]])
}
}
# Fold-averaged threshold across all v2 spatiotemporal folds.
# get mean threshold
mean.threshold <- mean(threshold)
# turn imp.vars into data frame
# BUG FIX: imp.vars is a data frame here, so factor(imp.vars) factored a whole
# data frame — producing the "cannot xtfrm data frames" warning and garbage
# counts. Use the `variable` column and drop the "" placeholder row first.
imp.vars.df <- imp.vars %>%
filter(row_number() > 1) %>%
transmute(var = factor(variable))
# get count of each variable to rank general importance to apply to main data
v2.imp.vars.count <- imp.vars.df %>%
count(var) %>%
arrange(desc(n))
# get mean auc
mean.auc <- mean(auc)
# get mean accuracy p-value
mean.p_val <- mean(p_val)
# Per-variable mean AUC and mean accuracy p-value over the folds that selected it.
# CONSISTENCY FIX: the summarised columns held raw means, not ranks, but were
# named mean.auc.rank / mean.p_val.rank; renamed to match the v1 section
# (values and ordering are unchanged).
imp.vars.df.ranked <- imp.vars %>%
filter(row_number() > 1) %>%
mutate(auc = as.numeric(auc),
p_val = as.numeric(p_val)) %>%
group_by(variable) %>%
summarise(mean.auc = mean(auc),
mean.p_val = mean(p_val))
# Higher mean AUC = better.
weighted_v2.auc.vars <- imp.vars.df.ranked %>%
arrange(desc(mean.auc))
# Lower accuracy p-value = better.
weighted_v2.p_val.vars <- imp.vars.df.ranked %>%
arrange((mean.p_val))
metrics_df.balanced_v2 <- data.frame(metric = c('AUC', 'Accuracy P-value', 'Threshold'), value = c(mean.auc, mean.p_val, mean.threshold))
metrics_df.balanced_v2
## metric value
## 1 AUC 0.7526243
## 2 Accuracy P-value 0.1034658
## 3 Threshold 0.2897455
# Select the top 10 variables for each ranking.
weighted_v2.auc.imp_vars <- weighted_v2.auc.vars %>%
filter(row_number() <= 10)
weighted_v2.p_val.imp_vars <- weighted_v2.p_val.vars %>%
filter(row_number() <= 10)
set.seed(123)
## auc variables first ##
# Top-10 AUC-weighted variables plus the four forest variables of interest.
weighted_v2.auc.predictor_names <- c(weighted_v2.auc.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
unique()
weighted_v2.auc.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v2.auc.predictor_names, collapse=" + ")))
# get testing and training split
# NOTE(review): this split is unused — the shared balanced holdout split below
# is what the model sees (split kept so the RNG state matches earlier runs).
split <- initial_split(data, strata = CL)
# weighted_v2.datrain <- training(split)
# weighted_v2.datest <- testing(split)
weighted_v2.datrain <- balanced.holdout.train
weighted_v2.datest <- balanced.holdout.test
# get model
weighted_v2.auc.mod <- glm(weighted_v2.auc.ml_formula, data=weighted_v2.datrain, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# calculate out of sample model performance on weighted_v2.datest
weighted_v2.auc.oob <- predict(weighted_v2.auc.mod, newdata=weighted_v2.datest, type='response', na.action = na.pass)
weighted_v2.auc.temp_auc_out <- pROC::roc(response = weighted_v2.datest$CL, predictor= weighted_v2.auc.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.auc.best_threshold_out <- pROC::coords(weighted_v2.auc.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize at the ROC-optimal threshold and pair with true labels.
weighted_v2.auc.metrica.format <- data.frame(
cbind(ifelse(weighted_v2.datest$CL==1, 1, 0),
ifelse(weighted_v2.auc.oob>=weighted_v2.auc.best_threshold_out[1,1], 1, 0))) %>% # 0.821
mutate_all(as.factor)
colnames(weighted_v2.auc.metrica.format) <- c("labels","predictions")
rownames(weighted_v2.auc.metrica.format) <- 1:dim(weighted_v2.auc.metrica.format)[1]
weighted_v2.auc.auc_out <- pROC::roc(response = weighted_v2.auc.metrica.format$labels, predictor= weighted_v2.auc.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.auc.metrica.format.confusionMatrix <- weighted_v2.auc.metrica.format$predictions %>%
confusionMatrix(positive='1', reference=weighted_v2.auc.metrica.format$labels)
weighted_v2.auc.sensitivity_out <- caret::recall(data = weighted_v2.auc.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v2.auc.metrica.format.confusionMatrix$table)[2])
# NOTE(review): caret::precision is PPV, not specificity — name is misleading.
weighted_v2.auc.specificity_out <- caret::precision(data = weighted_v2.auc.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v2.auc.metrica.format.confusionMatrix$table)[2])
weighted_v2.auc.var_imp_vars <- vip::vip(weighted_v2.auc.mod)
set.seed(123)
## p_val variables ## (comment fixed: this section uses the p-value-ranked set)
# Top-10 p-value-weighted variables plus the four forest variables of interest.
weighted_v2.p_val.predictor_names <- c(weighted_v2.p_val.imp_vars$variable, "forest_density", "forest_fragmentation", "land_use_change", "edge_loss") %>%
unique()
weighted_v2.p_val.ml_formula <- as.formula(paste0("CL ~", paste(weighted_v2.p_val.predictor_names, collapse=" + ")))
# get testing and training split
# NOTE(review): this split is unused — the shared balanced holdout split below
# is what the model sees (split kept so the RNG state matches earlier runs).
split <- initial_split(data, strata = CL)
# weighted_v2.datrain <- training(split)
# weighted_v2.datest <- testing(split)
weighted_v2.datrain <- balanced.holdout.train
weighted_v2.datest <- balanced.holdout.test
# get model
weighted_v2.p_val.mod <- glm(weighted_v2.p_val.ml_formula, data=weighted_v2.datrain, family=binomial)
# calculate out of sample model performance on weighted_v2.datest
weighted_v2.p_val.oob <- predict(weighted_v2.p_val.mod, newdata=weighted_v2.datest, type='response', na.action = na.pass)
weighted_v2.p_val.temp_auc_out <- pROC::roc(response = weighted_v2.datest$CL, predictor= weighted_v2.p_val.oob, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.p_val.best_threshold_out <- pROC::coords(weighted_v2.p_val.temp_auc_out, "best", ret = "threshold") # save this threshold; want: take average of threshold across all spatio-temporal folds
# Binarize at the ROC-optimal threshold and pair with true labels.
weighted_v2.p_val.metrica.format <- data.frame(
cbind(ifelse(weighted_v2.datest$CL==1, 1, 0),
ifelse(weighted_v2.p_val.oob>=weighted_v2.p_val.best_threshold_out[1,1], 1, 0))) %>% # 0.821
mutate_all(as.factor)
colnames(weighted_v2.p_val.metrica.format) <- c("labels","predictions")
rownames(weighted_v2.p_val.metrica.format) <- 1:dim(weighted_v2.p_val.metrica.format)[1]
weighted_v2.p_val.auc_out <- pROC::roc(response = weighted_v2.p_val.metrica.format$labels, predictor= weighted_v2.p_val.metrica.format$predictions %>% as.numeric() - 1, levels=c(0,1), auc = TRUE)
## Setting direction: controls < cases
weighted_v2.p_val.metrica.format.confusionMatrix <- weighted_v2.p_val.metrica.format$predictions %>%
confusionMatrix(positive='1', reference=weighted_v2.p_val.metrica.format$labels)
weighted_v2.p_val.sensitivity_out <- caret::recall(data = weighted_v2.p_val.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v2.p_val.metrica.format.confusionMatrix$table)[2])
# NOTE(review): caret::precision is PPV, not specificity — name is misleading.
weighted_v2.p_val.specificity_out <- caret::precision(data = weighted_v2.p_val.metrica.format.confusionMatrix$table,
relevant = rownames(weighted_v2.p_val.metrica.format.confusionMatrix$table)[2])
weighted_v2.p_val.var_imp_vars <- vip::vip(weighted_v2.p_val.mod)
We open the results section with the confusion matrices and AUCs of all models.
Full variables:
full.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1248 600
## 1 488 4096
##
## Accuracy : 0.8308
## 95% CI : (0.8215, 0.8399)
## No Information Rate : 0.7301
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5793
##
## Mcnemar's Test P-Value : 0.0007649
##
## Sensitivity : 0.8722
## Specificity : 0.7189
## Pos Pred Value : 0.8935
## Neg Pred Value : 0.6753
## Prevalence : 0.7301
## Detection Rate : 0.6368
## Detection Prevalence : 0.7127
## Balanced Accuracy : 0.7956
##
## 'Positive' Class : 1
##
full.auc_out
##
## Call:
## roc.default(response = full.metrica.format$labels, predictor = full.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: full.metrica.format$predictions %>% as.numeric() - 1 in 1736 controls (full.metrica.format$labels 0) < 4696 cases (full.metrica.format$labels 1).
## Area under the curve: 0.7956
First unweighted resampling approach (v1):
v1.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1323 734
## 1 464 4034
##
## Accuracy : 0.8172
## 95% CI : (0.8077, 0.8265)
## No Information Rate : 0.7274
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.56
##
## Mcnemar's Test P-Value : 7.735e-15
##
## Sensitivity : 0.8461
## Specificity : 0.7403
## Pos Pred Value : 0.8968
## Neg Pred Value : 0.6432
## Prevalence : 0.7274
## Detection Rate : 0.6154
## Detection Prevalence : 0.6862
## Balanced Accuracy : 0.7932
##
## 'Positive' Class : 1
##
v1.auc_out
##
## Call:
## roc.default(response = v1.metrica.format$labels, predictor = v1.metrica.format$predictions %>% as.ordered(), levels = c(0, 1), auc = TRUE)
##
## Data: v1.metrica.format$predictions %>% as.ordered() in 1787 controls (v1.metrica.format$labels 0) < 4768 cases (v1.metrica.format$labels 1).
## Area under the curve: 0.7932
Second unweighted resampling approach (v2):
v2.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1402 931
## 1 385 3837
##
## Accuracy : 0.7992
## 95% CI : (0.7893, 0.8089)
## No Information Rate : 0.7274
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5379
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8047
## Specificity : 0.7846
## Pos Pred Value : 0.9088
## Neg Pred Value : 0.6009
## Prevalence : 0.7274
## Detection Rate : 0.5854
## Detection Prevalence : 0.6441
## Balanced Accuracy : 0.7946
##
## 'Positive' Class : 1
##
v2.auc_out
##
## Call:
## roc.default(response = v2.metrica.format$labels, predictor = v2.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: v2.metrica.format$predictions %>% as.numeric() - 1 in 1787 controls (v2.metrica.format$labels 0) < 4768 cases (v2.metrica.format$labels 1).
## Area under the curve: 0.7946
First weighted variables approach (v1):
weighted_v1.auc.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1142 595
## 1 587 4146
##
## Accuracy : 0.8173
## 95% CI : (0.8077, 0.8267)
## No Information Rate : 0.7328
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5342
##
## Mcnemar's Test P-Value : 0.8387
##
## Sensitivity : 0.8745
## Specificity : 0.6605
## Pos Pred Value : 0.8760
## Neg Pred Value : 0.6575
## Prevalence : 0.7328
## Detection Rate : 0.6408
## Detection Prevalence : 0.7315
## Balanced Accuracy : 0.7675
##
## 'Positive' Class : 1
##
weighted_v1.auc.auc_out
##
## Call:
## roc.default(response = weighted_v1.auc.metrica.format$labels, predictor = weighted_v1.auc.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: weighted_v1.auc.metrica.format$predictions %>% as.numeric() - 1 in 1729 controls (weighted_v1.auc.metrica.format$labels 0) < 4741 cases (weighted_v1.auc.metrica.format$labels 1).
## Area under the curve: 0.7675
weighted_v1.p_val.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1044 479
## 1 664 4256
##
## Accuracy : 0.8226
## 95% CI : (0.813, 0.8319)
## No Information Rate : 0.7349
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5284
##
## Mcnemar's Test P-Value : 5.255e-08
##
## Sensitivity : 0.8988
## Specificity : 0.6112
## Pos Pred Value : 0.8650
## Neg Pred Value : 0.6855
## Prevalence : 0.7349
## Detection Rate : 0.6606
## Detection Prevalence : 0.7636
## Balanced Accuracy : 0.7550
##
## 'Positive' Class : 1
##
weighted_v1.p_val.auc_out
##
## Call:
## roc.default(response = weighted_v1.p_val.metrica.format$labels, predictor = weighted_v1.p_val.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: weighted_v1.p_val.metrica.format$predictions %>% as.numeric() - 1 in 1708 controls (weighted_v1.p_val.metrica.format$labels 0) < 4735 cases (weighted_v1.p_val.metrica.format$labels 1).
## Area under the curve: 0.755
Second weighted variables approach (v2):
weighted_v2.auc.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1337 945
## 1 450 3823
##
## Accuracy : 0.7872
## 95% CI : (0.7771, 0.797)
## No Information Rate : 0.7274
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5062
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8018
## Specificity : 0.7482
## Pos Pred Value : 0.8947
## Neg Pred Value : 0.5859
## Prevalence : 0.7274
## Detection Rate : 0.5832
## Detection Prevalence : 0.6519
## Balanced Accuracy : 0.7750
##
## 'Positive' Class : 1
##
weighted_v2.auc.auc_out
##
## Call:
## roc.default(response = weighted_v2.auc.metrica.format$labels, predictor = weighted_v2.auc.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: weighted_v2.auc.metrica.format$predictions %>% as.numeric() - 1 in 1787 controls (weighted_v2.auc.metrica.format$labels 0) < 4768 cases (weighted_v2.auc.metrica.format$labels 1).
## Area under the curve: 0.775
weighted_v2.p_val.metrica.format.confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1078 556
## 1 651 4185
##
## Accuracy : 0.8134
## 95% CI : (0.8037, 0.8229)
## No Information Rate : 0.7328
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5152
##
## Mcnemar's Test P-Value : 0.006817
##
## Sensitivity : 0.8827
## Specificity : 0.6235
## Pos Pred Value : 0.8654
## Neg Pred Value : 0.6597
## Prevalence : 0.7328
## Detection Rate : 0.6468
## Detection Prevalence : 0.7474
## Balanced Accuracy : 0.7531
##
## 'Positive' Class : 1
##
weighted_v2.p_val.auc_out
##
## Call:
## roc.default(response = weighted_v2.p_val.metrica.format$labels, predictor = weighted_v2.p_val.metrica.format$predictions %>% as.numeric() - 1, levels = c(0, 1), auc = TRUE)
##
## Data: weighted_v2.p_val.metrica.format$predictions %>% as.numeric() - 1 in 1729 controls (weighted_v2.p_val.metrica.format$labels 0) < 4741 cases (weighted_v2.p_val.metrica.format$labels 1).
## Area under the curve: 0.7531
The weighted-variable approaches give very poor performance for predicting absences. The objectively “best”-performing model is the one using all variables, but the unweighted-variable approaches are close behind. These should take the place of the full-variable approach since they use fewer variables, which improves generalizability.
# Attach the model's predictions to the held-out test set and flag whether
# each observation was classified correctly.
# (Alternative: keep only the misclassified rows by filtering on
# full.metrica.format$labels != full.metrica.format$predictions.)
full.incorrect <- full.datest %>%
  mutate(
    .pred   = full.metrica.format$predictions,
    correct = .pred == CL
  ) %>%
  select(Code, Name, Country, Year, CL, .pred, correct, everything())

summary(full.incorrect)
## Code Name Country Year CL
## Min. : 10101 Length:6554 Brazil :4299 Min. :2001 0:1828
## 1st Qu.: 120807 Class :character Colombia: 339 1st Qu.:2008 1:4726
## Median :1503309 Mode :character Peru :1916 Median :2012
## Mean :1804953 Mean :2011
## 3rd Qu.:2107704 3rd Qu.:2015
## Max. :5300108 Max. :2020
##
## .pred correct NDVI EVI LST_Day
## 0 :1848 Mode :logical Min. :0.1748 Min. :0.09244 Min. :11.86
## 1 :4584 FALSE:1088 1st Qu.:0.5450 1st Qu.:0.33562 1st Qu.:26.11
## NA's: 122 TRUE :5344 Median :0.6193 Median :0.40060 Median :28.69
## NA's :122 Mean :0.5942 Mean :0.38187 Mean :27.87
## 3rd Qu.:0.6751 3rd Qu.:0.45251 3rd Qu.:31.23
## Max. :0.8208 Max. :0.55824 Max. :37.33
##
## min_LST_Day max_LST_Day LST_Night min_LST_Night
## Min. :-15.19 Min. :18.28 Min. :-6.076 Min. :-38.503
## 1st Qu.: 18.13 1st Qu.:31.02 1st Qu.:15.678 1st Qu.: 5.795
## Median : 23.68 Median :34.77 Median :20.865 Median : 15.934
## Mean : 20.86 Mean :34.76 Mean :17.048 Mean : 10.830
## 3rd Qu.: 25.37 3rd Qu.:38.62 3rd Qu.:21.792 3rd Qu.: 18.433
## Max. : 30.41 Max. :48.45 Max. :24.856 Max. : 22.929
##
## max_LST_Night HNTL Precip Evap_tavg
## Min. :-1.545 Min. : 0.0000 Min. : 184 Min. :1.500e-06
## 1st Qu.:19.768 1st Qu.: 0.1057 1st Qu.:1177 1st Qu.:3.289e-05
## Median :23.243 Median : 0.5350 Median :1650 Median :4.175e-05
## Mean :19.932 Mean : 1.9441 Mean :1714 Mean :3.883e-05
## 3rd Qu.:24.194 3rd Qu.: 2.4261 3rd Qu.:2114 3rd Qu.:4.585e-05
## Max. :27.895 Max. :62.4059 Max. :4696 Max. :6.196e-05
##
## Qair_f_tavg SoilMoi00_10cm_tavg Wind_f_tavg Tair_f_tavg
## Min. :0.004105 Min. :0.2003 Min. :1.414 Min. :273.6
## 1st Qu.:0.011753 1st Qu.:0.3043 1st Qu.:2.329 1st Qu.:295.0
## Median :0.014116 Median :0.3302 Median :3.085 Median :298.9
## Mean :0.013484 Mean :0.3349 Mean :3.147 Mean :295.3
## 3rd Qu.:0.016364 3rd Qu.:0.3643 3rd Qu.:3.854 3rd Qu.:299.8
## Max. :0.018908 Max. :0.4276 Max. :6.770 Max. :302.3
##
## Population min_Precip max_Precip min_Precip_Month
## Min. : 218.2 Min. : 0.000 Min. : 31.54 Min. : 1.000
## 1st Qu.: 3896.3 1st Qu.: 0.365 1st Qu.: 245.34 1st Qu.: 6.000
## Median : 9131.3 Median : 6.984 Median : 331.23 Median : 7.000
## Mean : 25640.5 Mean : 20.599 Mean : 330.70 Mean : 7.222
## 3rd Qu.: 19547.5 3rd Qu.: 23.708 3rd Qu.: 407.55 3rd Qu.: 8.000
## Max. :2791764.8 Max. :266.818 Max. :1267.35 Max. :12.000
##
## max_Precip_Month forest_density SWChange_abs SWChange_norm
## Min. : 1.000 Min. : 0.000 Min. :-128.000 Min. :-128.000
## 1st Qu.: 2.000 1st Qu.: 7.266 1st Qu.: -3.184 1st Qu.: 1.497
## Median : 3.000 Median :23.216 Median : 1.110 Median : 14.630
## Mean : 4.147 Mean :33.754 Mean : 2.684 Mean : 21.503
## 3rd Qu.: 5.000 3rd Qu.:57.953 3rd Qu.: 8.046 3rd Qu.: 38.663
## Max. :12.000 Max. :99.628 Max. : 84.645 Max. : 100.000
## NA's :90 NA's :90
## SWOccurrence SWRecurrence SWSeasonality max_Elevation
## Min. : 0.00 Min. : 24.00 Min. : 1.000 Min. : 29
## 1st Qu.:41.98 1st Qu.: 77.46 1st Qu.: 5.340 1st Qu.: 344
## Median :60.37 Median : 85.27 Median : 7.290 Median : 642
## Mean :58.52 Mean : 83.64 Mean : 7.233 Mean :1516
## 3rd Qu.:77.41 3rd Qu.: 91.56 3rd Qu.: 9.199 3rd Qu.:2700
## Max. :98.41 Max. :100.00 Max. :11.908 Max. :6607
## NA's :92 NA's :90 NA's :122
## mean_Elevation min_Elevation var_Elevation fold
## Min. : 3.383 Min. : -3.0 Min. : 10.9 Min. :1.000
## 1st Qu.: 165.504 1st Qu.: 80.0 1st Qu.: 866.5 1st Qu.:1.000
## Median : 315.948 Median : 198.0 Median : 4559.3 Median :3.000
## Mean :1003.001 Mean : 628.7 Mean : 89019.1 Mean :2.126
## 3rd Qu.:1054.828 3rd Qu.: 540.0 3rd Qu.: 66456.5 3rd Qu.:3.000
## Max. :4814.186 Max. :4350.0 Max. :1871815.9 Max. :3.000
##
## forest_fragmentation land_use_change edge_loss long
## Min. : 0.0 Min. :-8.41190 Min. :-2436870 Min. :-8866471
## 1st Qu.: 670.8 1st Qu.:-0.22060 1st Qu.: -13590 1st Qu.:-8211734
## Median : 1724.6 Median :-0.01662 Median : 600 Median :-6313985
## Mean : 16240.9 Mean :-0.16779 Mean : 28823 Mean :-6677677
## 3rd Qu.: 5934.7 3rd Qu.: 0.03734 3rd Qu.: 30360 3rd Qu.:-5389090
## Max. :750054.3 Max. : 5.30915 Max. : 2926320 Max. :-4851417
##
## lat
## Min. :-2124011
## 1st Qu.:-1427949
## Median : -948492
## Mean : -927481
## 3rd Qu.: -500867
## Max. : 649268
##
# plots: compare predictor distributions between correctly and incorrectly
# classified observations.
# Population is heavily right-skewed, so work with log(Population) instead;
# column order is preserved so the NDVI:log_Population range below stays valid.
full.incorrect.important_vars <- full.incorrect %>%
  select(Code, Name, Year, CL, correct, everything()) %>%
  mutate(log_Population = log(Population)) %>%
  select(-Population) %>%
  pivot_longer(cols = NDVI:log_Population)

# boxplots: value by classification outcome, one panel per predictor
full.incorrect.important_vars %>%
  ggplot() +
  geom_boxplot(aes(x = correct, y = value)) +
  facet_wrap(facets = "name", scales = "free", ncol = 3)
## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).
# density curves: same comparison, overlaid by outcome
full.incorrect.important_vars %>%
  ggplot() +
  geom_density(aes(x = value, col = correct)) +
  facet_wrap(facets = "name", scales = "free", ncol = 3)
## Warning: Removed 484 rows containing non-finite values (`stat_density()`).
# get a data frame that gives a percentage of how many instances each municipio was classified incorrectly
#
# NOTE(fix): the previous version created per-row columns with mutate()
# (num_years, num_correct) and then referenced them inside summarise(), so
# summarise() returned one duplicated row per observation rather than one row
# per group -- the source of the dplyr >= 1.1 "Returning more (or less) than
# 1 row per summarise() group" deprecation warning. Computing the aggregates
# directly inside summarise() yields exactly one row per Code; the downstream
# unique() after the geometry join produces the same final data either way.
percent_wrong.df <- full.incorrect %>%
  select(Code, Year, Country, correct,
         land_use_change, forest_density, forest_fragmentation, edge_loss, mean_Elevation) %>%
  group_by(Code) %>% # group by Code when summarizing since number of observations per municipio differ by country and code
  summarise(
    # sum(correct) propagates NA when any prediction is NA (as before);
    # those rows are dropped later via filter(!is.na(percent_wrong))
    percent_wrong = 1 - (sum(correct) / n_distinct(Year)),
    median_deforested = median(land_use_change),
    max_forest_change = max(forest_density) - min(forest_density),
    mean_forest_fragmentation = mean(forest_fragmentation),
    mean_edge_loss = mean(edge_loss),
    mean_Elevation = mean(mean_Elevation),
    .groups = "drop" # explicit: one ungrouped row per Code
  )
# One geometry per municipio Code, for mapping the error rates.
geom.df <- unique(
  select(
    readRDS("~/peregrine_amazon/data/annual/aad_2021_forests_geom.rds"),
    Code, geometry
  )
)

# Join geometries onto the per-municipio error rates, drop municipios with no
# usable error rate, and convert to an sf object for plotting/mapping.
percent_wrong.geom.df <- percent_wrong.df %>%
  left_join(geom.df, by = "Code") %>%
  filter(!is.na(percent_wrong)) %>%
  sf::st_as_sf() %>%
  unique()
# Municipios that were misclassified in every year observed.
percent_wrong.all_incorrect.df <- percent_wrong.geom.df %>%
  filter(percent_wrong == 1)

percent_wrong.all_incorrect.df %>%
  ggplot() +
  geom_density(aes(x = max_forest_change))

# Municipios misclassified in at least one year.
percent_wrong.no_correct.df <- percent_wrong.geom.df %>%
  filter(percent_wrong > 0)

percent_wrong.no_correct.df %>%
  ggplot() +
  geom_density(aes(x = max_forest_change))

# maps; change parameters in zcol
mapview::mapview(percent_wrong.all_incorrect.df, zcol = 'mean_forest_fragmentation')
mapview::mapview(percent_wrong.no_correct.df, zcol = 'mean_forest_fragmentation')